Data Cleaning¶

Task: Merge Redundant Values¶

In [1]:
import pandas as pd
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')
In [2]:
brightcove_1 = pd.read_csv('brightcove sizes cleaned.csv', encoding='iso-8859-1')
In [3]:
brightcove = brightcove_1.copy()
In [4]:
brightcove.head()
Out[4]:
Unnamed: 0 video_id name created_at master_size hls_renditions_size mp4_renditions_size audio_renditions_size duration created_by folder_id images published_at updated_at updated_by tags created_by_id updated_by_id created_by_email updated_by_email
0 0 6264776212001 CLF 2021-07-22 0.099985 0.293185 0.036815 0.009675 2.615817 {"type":"user","id":"76076235132","email":"EHo... 61f17b499d7e3f03e9fbc2c0 {"thumbnail":{"src":"https://cf-images.us-east... 2022-05-02 18:08:23.995000+00:00 2022-05-02 18:37:42.165000+00:00 {"type":"internal"} [''] 7.607624e+10 NaN EHoff@cw.moneymappress.com NaN
1 1 6268160131001 PLT Takes Out the Technical Trinity 2021-08-16 0.009995 0.206397 0.027653 0.006493 1.762133 {"type":"internal"} 626c69dcb0299755821f4eb3 {"poster":{"src":"https://cf-images.us-east-1.... 2022-05-02 17:32:19.086000+00:00 2022-05-02 17:51:13.050000+00:00 {"type":"internal"} ['mmlive_replays'] NaN NaN NaN NaN
2 2 6284596661001 Garrett Baldwin - REPLAY - 11-30-2021 2021-12-01 0.998009 3.820449 0.494816 0.114247 30.940800 {"type":"user","id":"76076235128","email":"nyt... 61683bcb13798a65ef63a329 {"thumbnail":{"src":"https://cf-images.us-east... 2022-05-02 17:15:37.172000+00:00 2022-05-02 18:22:47.635000+00:00 {"type":"user","id":"76076235127","email":"rsa... ['garrett baldwin'] 7.607624e+10 7.607624e+10 nyteowl9@gmail.com rsandusky@cw.moneymappress.com
3 3 6305270399001 Shah Gilani REPLAY - 27 April 2022 2022-04-27 0.997150 4.092209 0.529739 0.122784 33.143117 {"type":"user","id":"76076235127","email":"rsa... 611160bdbda6bb0af9aa53a3 {"poster":{"src":"https://cf-images.us-east-1.... 2022-05-02 16:31:57.794000+00:00 2022-05-02 17:04:57.053000+00:00 {"type":"internal"} ['shah gilani'] 7.607624e+10 NaN rsandusky@cw.moneymappress.com NaN
4 4 6293084125001 Bobby Eight - 8Ball - 01212022 MML 2022-01-22 0.996883 3.098494 0.405372 0.090642 25.436800 {"type":"user","id":"76076247291","email":"ami... 625eee609de8257ffa4d7472 {"poster":{"src":"https://cf-images.us-east-1.... 2022-05-02 15:31:33.255000+00:00 2022-05-02 16:03:01.164000+00:00 {"type":"user","id":"76076235127","email":"rsa... ['8baller', 'main room', '8ball'] 7.607625e+10 7.607624e+10 amine.cherouati94@gmail.com rsandusky@cw.moneymappress.com
In [5]:
df = brightcove[['duration','tags']].copy()
df['tags'] = df['tags'].apply(literal_eval) #convert to list type
df = df.explode('tags')
In [6]:
df['tags'] = df.tags.str.replace(r'(^.*ball.*$)', '8 baller')
df['tags'] = df.tags.str.replace(r'(^.*1450.*$)', '1450 club')
df['tags'] = df.tags.str.replace(r'(^.*a ndrew keene.*$)', 'andrew keene')
df['tags'] = df.tags.str.replace(r'(^.*ak.*$)', 'akshow')
df['tags'] = df.tags.str.replace(r'(^.*andrew.*$)', 'andrew keene')
df['tags'] = df.tags.str.replace(r'(^.*angels.*$)', 'angels and entrepreneurs')
df['tags'] = df.tags.str.replace(r'(^.*angels and entrepreneurs*$)', 'angels and entreprenuers')
df['tags'] = df.tags.str.replace(r'(^.*baldiwn.*$)', 'baldwin')
df['tags'] = df.tags.str.replace(r'(^.*bobby eight.*$)', 'bobby-eight')
df['tags'] = df.tags.str.replace(r'(^.*brandon-clip.*$)', 'brandon')
df['tags'] = df.tags.str.replace(r'(^.*brian.*$)', 'brian king')
df['tags'] = df.tags.str.replace(r'(^.*brian-clip*$)', 'brian king')
df['tags'] = df.tags.str.replace(r'(^.*brian-king*$)', 'brian king')
df['tags'] = df.tags.str.replace(r'(^.*chris.*$)', 'chris johnson')
df['tags'] = df.tags.str.replace(r'(^.*cj.*$)', 'cj')
df['tags'] = df.tags.str.replace(r'(^.*crupto.*$)', 'crypto')
df['tags'] = df.tags.str.replace(r'(^.*eric.*$)', 'eric hoff')
df['tags'] = df.tags.str.replace(r'(^.*fast.*$)', 'fast fortune')
df['tags'] = df.tags.str.replace(r'(^.*foundations .*$)', 'foundations trading')
df['tags'] = df.tags.str.replace(r'(^.*garrett.*$)', 'garrett baldwin')
df['tags'] = df.tags.str.replace(r'(^.*gbald.*$)', 'garrett baldwin')
df['tags'] = df.tags.str.replace(r'(^.*gbshow.*$)', 'garrett baldwin')
df['tags'] = df.tags.str.replace(r'(^.*hot.*$)', 'hot clip')
df['tags'] = df.tags.str.replace(r'(^.*hot .*$)', 'hot clip')
df['tags'] = df.tags.str.replace(r'(^.*hotclip.*$)', 'hot clip')
df['tags'] = df.tags.str.replace(r'(^.*jared.*$)', 'jared levy')
df['tags'] = df.tags.str.replace(r'(^.*kenny.*$)', 'kenny glick')
df['tags'] = df.tags.str.replace(r'(^.*live.*$)', 'live event')
df['tags'] = df.tags.str.replace(r'(^.*liz.*$)', 'liz-clip')
df['tags'] = df.tags.str.replace(r'(^.*locked.*$)', 'locked & loaded')
df['tags'] = df.tags.str.replace(r'(^.*main.*$)', 'main')
df['tags'] = df.tags.str.replace(r'(^.*mark.*$)', 'mark sebastian')
df['tags'] = df.tags.str.replace(r'(^.*mml.*$)', 'mml')
df['tags'] = df.tags.str.replace(r'(^.*monday.*$)', 'monday')
df['tags'] = df.tags.str.replace(r'(^.*morning.*$)', 'morning')
df['tags'] = df.tags.str.replace(r'(^.*ms.*$)', 'ms')
df['tags'] = df.tags.str.replace(r'(^.*nick.*$)', 'nick black')
df['tags'] = df.tags.str.replace(r'(^.*night.*$)', 'night trader')
df['tags'] = df.tags.str.replace(r'(^.*options.*$)', 'options')
df['tags'] = df.tags.str.replace(r'(^.*profit.*$)', 'profit')
df['tags'] = df.tags.str.replace(r'(^.*project 303.*$)', 'project 303')
df['tags'] = df.tags.str.replace(r'(^.*pump.*$)', 'pump')
df['tags'] = df.tags.str.replace(r'(^.*reg.*$)', 'reg')
df['tags'] = df.tags.str.replace(r'(^.*solid.*$)', 'solid jello')
df['tags'] = df.tags.str.replace(r'(^.*stinger.*$)', 'stinger')
df['tags'] = df.tags.str.replace(r'(^.*tim.*$)', 'tim melvin')
df['tags'] = df.tags.str.replace(r'(^.*tom.*$)', 'tom gentile')
df['tags'] = df.tags.str.replace(r'(^.*trade.*$)', 'trade')
df['tags'] = df.tags.str.replace(r'(^.*unicorn.*$)', 'unicorn hunters')
df['tags'] = df.tags.str.replace(r'(^.*volume.*$)', 'volume')
df['tags'] = df.tags.str.replace(r'(^.*vwap.*$)', 'vwap')
df['tags'] = df.tags.str.replace(r'(^.*warlock.*$)', 'warlock')
df['tags'] = df.tags.str.replace(r'(^.*warlock.*$)', 'warlock')
df['tags'] = df.tags.str.replace(r'(^.*wade.*$)', 'wade')
df['tags'] = df.tags.str.replace(r'(^.*world.*$)', 'world')
In [7]:
df = df.groupby('tags',as_index =False).mean()
df.columns = ['Tags', 'Average Duration']
In [9]:
df.head(20)
Out[9]:
Tags Average Duration
0 16.573326
1 04/28/2021 5.942339
2 04/29/2021 12.041250
3 101s 2.688291
4 10mm_media 19.472717
5 1450 club 64.072744
6 303 31.123025
7 3m 2.627200
8 8 baller 33.659754
9 a & e 27.004800
10 akshow 42.314451
11 ama 3.631283
12 andrew keene 30.725688
13 angels and entreprenuers 28.556359
14 baldwin 31.002667
15 biggest 55.852258
16 bitcoin 49.644926
17 black 0.694050
18 bobby-eight 59.998729
19 boom 3.305600